#First import the necessary packages
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from pathlib import Path
import seaborn as sns
from scipy import stats
import os
import numpy as np
import statsmodels.formula.api as sm
import statsmodels.api as sm2
# %matplotlib inline  (IPython magic - valid only inside a notebook; commented out so this file parses as plain Python)
import scipy.stats as stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy.stats import sem
from scipy.stats import t
from math import sqrt
import statsmodels as sm1
#Establish the database connection
# sqlite3.connect() is the documented factory function; instantiating the
# Connection class directly works but bypasses the public API.
conn = sqlite3.connect('database1.sqlite')
c = conn.cursor()
#making a string to add all the bookies to the query
# 30 odds columns (home/draw/away per bookie), qualified with table alias "d"
bookies = "d.B365H, d.B365D, d.B365A, d.BWH, d.BWD, d.BWA, d.IWH, d.IWD, d.IWA, d.LBH, d.LBD, d.LBA, d.PSH, d.PSD, d.PSA, d.WHH, d.WHD, d.WHA, d.SJH, d.SJD, d.SJA, d.VCH, d.VCD, d.VCA, d.GBH, d.GBD, d.GBA, d.BSH, d.BSD, d.BSA"
# Rewrite the alias prefix "d." -> "m." to match the query below.  The original
# replace('d', 'm') replaced every lowercase 'd' and only worked by luck
# because no column name happens to contain one; replacing "d." is explicit.
bookies = bookies.replace('d.', 'm.')
#main query gets win loss data and bookie odds
# (f-string interpolation of the pre-built bookie column list)
q = f"""
SELECT m.home_team_goal, m.away_team_goal, m.home_team_api_id, {bookies} FROM
Match m
"""
df = pd.read_sql_query(q, conn)
# second, independent copy kept around for later pairplots of raw odds
df_copy = pd.read_sql_query(q, conn)
#Sets up boolean columns marking the match outcome: home win, away win, or draw
df['HomeWin'] = df['home_team_goal'].gt(df['away_team_goal'])
df['AwayWin'] = df['away_team_goal'].gt(df['home_team_goal'])
df['Draw'] = df['home_team_goal'].eq(df['away_team_goal'])
#Convert odds to implied probability
# NOTE(review): for decimal (European) odds d the implied probability is 1/d;
# d/(1+d) is the conversion for *fractional* odds.  The rest of this notebook
# consistently inverts with p/(1-p), so the analysis is internally consistent,
# but confirm which odds format the database stores before reusing this.
# Iterating a DataFrame slice yields its column names.
for column in df.loc[:,'B365H':'BSA']:
    df[column]= df[column]/(1+df[column])
#reorder the columns to make more sense: goals, outcome flags, then team id + odds
cols = list(df.columns)
# 36 columns total: [goals(2)] + [HomeWin/AwayWin/Draw (last 3)] + [team id + 30 odds]
cols = cols[0:2] + cols[-3:] + cols[2:33]
df = df[cols]
#code from: https://stackoverflow.com/questions/53141240/pandas-how-to-swap-or-reorder-columns
#checking for null values
df.isna().sum()
# notebook output: home_team_goal 0 away_team_goal 0 HomeWin 0 AwayWin 0 Draw 0 home_team_api_id 0 B365H 3387 B365D 3387 B365A 3387 BWH 3404 BWD 3404 BWA 3404 IWH 3459 IWD 3459 IWA 3459 LBH 3423 LBD 3423 LBA 3423 PSH 14811 PSD 14811 PSA 14811 WHH 3408 WHD 3408 WHA 3408 SJH 8882 SJD 8882 SJA 8882 VCH 3411 VCD 3411 VCA 3411 GBH 11817 GBD 11817 GBA 11817 BSH 11818 BSD 11818 BSA 11818 dtype: int64
#drop columns with over 5000 null values - 22432 total rows
df.drop(['PSA','PSH','PSD','GBH','GBD','GBA','BSH','BSD','BSA','SJH','SJD','SJA'], axis=1, inplace=True)
#check for null values again
df.isna().sum()
# notebook output: home_team_goal 0 away_team_goal 0 HomeWin 0 AwayWin 0 Draw 0 home_team_api_id 0 B365H 3387 B365D 3387 B365A 3387 BWH 3404 BWD 3404 BWA 3404 IWH 3459 IWD 3459 IWA 3459 LBH 3423 LBD 3423 LBA 3423 WHH 3408 WHD 3408 WHA 3408 VCH 3411 VCD 3411 VCA 3411 dtype: int64
#Check the shape of the dataframe
df.shape
# notebook output: (25979, 24)
#Drop any row with a null value
df.dropna(inplace=True)
#Check the size again...still have nearly 23000 of an original 26000 rows, which is good
df.shape
# notebook output: (22432, 24)
#Curious of the probability of a home win across the entire dataframe
round(df.HomeWin.sum()/df.HomeWin.shape[0], 3)
# notebook output: 0.459
#Finds the mean across all bookies for Home Wins, Away Wins, and Draw mainly out of curiosity
# The ::3 step walks every third column, i.e. all the "...H" columns between B365H and VCH.
df['meanH'] = df.loc[:,'B365H':'VCH':3].mean(axis=1)
print("HOME WIN")
# p/(1-p) inverts the probability conversion applied earlier to recover mean odds
print(f'The mean odds a bookie offers for a home win is: {round(df.meanH.mean()/(1-df.meanH.mean()),3)} which translates to an implied probaility of: {round(df.meanH.mean(),3)}')
print(f'The observed probability the home team wins is: {round(df.HomeWin.sum()/df.HomeWin.shape[0],3)}')
print(f'The spread or difference of those means is: {round(df.meanH.mean()-df.HomeWin.sum()/df.HomeWin.shape[0],3)}')
print(' ')
df['meanA'] = df.loc[:,'B365A':'VCA':3].mean(axis=1)
print('AWAY WIN')
print(f'The mean odds a bookie offers for an away win is: {round(df.meanA.mean()/(1-df.meanA.mean()),3)} which translates to an implied probaility of: {round(df.meanA.mean(),3)}')
print(f'The observed probability the away team wins is: {round(df.AwayWin.sum()/df.AwayWin.shape[0],3)}')
print(f'The spread or difference of those means is: {round(df.meanA.mean()-df.AwayWin.sum()/df.AwayWin.shape[0],3)}')
print(' ')
df['meanD'] = df.loc[:,'B365D':'VCD':3].mean(axis=1)
print('DRAW')
print(f'The mean odds a bookie offers for a draw is: {round(df.meanD.mean()/(1-df.meanD.mean()),3)} which translates to an implied probaility of: {round(df.meanD.mean(),3)}')
print(f'The observed probability a draw is: {round(df.Draw.sum()/df.Draw.shape[0],3)}')
print(f'The spread or difference of those means is: {round(df.meanD.mean()-df.Draw.sum()/df.Draw.shape[0],3)}')
# notebook output: HOME WIN The mean odds a bookie offers for a home win is: 2.177 which translates to an implied probaility of: 0.685 The observed probability the home team wins is: 0.459 The spread or difference of those means is: 0.226 AWAY WIN The mean odds a bookie offers for an away win is: 3.372 which translates to an implied probaility of: 0.771 The observed probability the away team wins is: 0.288 The spread or difference of those means is: 0.483 DRAW The mean odds a bookie offers for a draw is: 3.606 which translates to an implied probaility of: 0.783 The observed probability a draw is: 0.253 The spread or difference of those means is: 0.53
# stray notebook artifacts (an echoed "..." / Ellipsis and a lone "." that raised
# SyntaxError in the session) commented out so the file parses:
# ...
# Ellipsis
# .
# File "<ipython-input-321-a5d5b61aa8a6>", line 1 . ^ SyntaxError: invalid syntax
#Distplots of each bookie across the three categories of prediction: -home wins, away win, and draw
#these plots visually appear fairly normal - though further testing for normality could still be done
fig, ax = plt.subplots(nrows=5, ncols=4)
fig.set_size_inches(16, 12)
# enumerate drives the subplot index; i//4 selects the row, i%4 the column
for i, column in enumerate(df.loc[:, 'B365H':'VCD'].columns):
    sns.distplot(df[column], ax=ax[i // 4][i % 4])
fig.tight_layout()
#code from Harrison
#This is to check and see which columns to transform
#The result of the test is that home win and draw columns should be transformed but not the away win column
#(True means transform improves normality)
def log_transform_test(df):
    """Check, per column, whether a log transform improves normality.

    Runs scipy's D'Agostino-Pearson normality test on each column before and
    after applying np.log, and records whether the test statistic decreased
    (a smaller statistic means the sample looks more normal).

    Parameters
    ----------
    df : pandas.DataFrame
        Columns must be strictly positive; np.log of non-positive values
        produces -inf/nan and invalidates the test.

    Returns
    -------
    list of [column_name, bool] pairs; True means the transform helped.
    """
    improved_list = []
    for column in df.columns:
        pre_stat = stats.normaltest(df[column]).statistic
        # vectorized np.log instead of a per-element .apply
        post_stat = stats.normaltest(np.log(df[column])).statistic
        # The original compared the whole NormaltestResult tuples, which only
        # worked because namedtuple ordering looks at .statistic first;
        # compare the statistics explicitly instead.
        improved_list.append([column, pre_stat > post_stat])
    return improved_list
# Run the normality check on the remaining bookie columns
log_transform_test(df.loc[:,'B365H':'VCD'])
# notebook output: [['B365H', True], ['B365D', True], ['B365A', False], ['BWH', True], ['BWD', True], ['BWA', False], ['IWH', True], ['IWD', True], ['IWA', False], ['LBH', True], ['LBD', True], ['LBA', False], ['WHH', True], ['WHD', True], ['WHA', False], ['VCH', True], ['VCD', True]]
def log_transform(df):
    """Log-transform every column of *df* in place and return it.

    Note: this MUTATES the DataFrame passed in (the call sites below pass
    ``.copy()`` slices for exactly that reason).  Columns must be strictly
    positive.
    """
    for column in df.columns:
        # vectorized; equivalent to the original per-element .apply(np.log)
        df[column] = np.log(df[column])
    return df
# Apply the log transform only to the home-win and draw odds columns (the
# normality check above indicated the away-win columns are better untransformed).
df_log_transformed = df.copy()
# ::3 steps over every third column, i.e. all "...H" columns / all "...D" columns
df_log_transformed.loc[:,'B365H':'VCH':3] = log_transform(df_log_transformed.copy().loc[:,'B365H':'VCH':3])
df_log_transformed.loc[:,'B365D':'VCD':3] = log_transform(df_log_transformed.copy().loc[:,'B365D':'VCD':3])
#Distplots of each bookie across the three categories of prediction AFTER log transformation ON -home wins and draw
#these plots visually appear fairly normal - though further testing for normality could still be done
fig, ax = plt.subplots(nrows=5, ncols=4)
fig.set_size_inches(16, 12)
# enumerate drives the subplot index; i//4 selects the row, i%4 the column
for i, column in enumerate(df_log_transformed.loc[:, 'B365H':'VCD'].columns):
    sns.distplot(df_log_transformed[column], ax=ax[i // 4][i % 4])
fig.tight_layout()
#code from Harrison
#Just out of curiosity creating a pairplot - scatter plots of the relationships between each book with the other
#and within self for all three of the categories, away win, home win, and draw
#This pairplot shows implied probability
#sns.pairplot(df.loc[:,'B365H':'VCD'])
#The following pairplot is of the unconverted wagering odds
#sns.pairplot(df_copy.loc[:,'B365H':'VCD'])
#This creates three new dataframes to hold ttest results
draw_df = df.loc[:,'B365D':'VCD':3].drop(df.index[0:df.shape[0]])
draw_df['Index_'] = df.loc[:,'B365D':'VCD':3].columns
draw_df = draw_df.set_index('Index_')
None
away_win_df = df.loc[:,'B365A':'VCA':3].drop(df.index[0:df.shape[0]])
away_win_df['Index_'] = df.loc[:,'B365A':'VCA':3].columns
away_win_df = away_win_df.set_index('Index_')
None
home_win_df = df.loc[:,'B365H':'VCH':3].drop(df.index[0:df.shape[0]])
home_win_df['Index_'] = df.loc[:,'B365H':'VCH':3].columns
home_win_df = home_win_df.set_index('Index_')
None
#This loops through the bookies to conduct individual one-to-one ttests for
#bookies across similar odds
list_of_dataframes = [home_win_df, away_win_df, draw_df]
for bookie_df in list_of_dataframes:
    for bookie_1 in bookie_df:
        for index, bookie_2 in enumerate(bookie_df):
            x = df[bookie_1]
            y = df[bookie_2]
            ttest = stats.ttest_ind(x, y)
            # Single .loc write (row label == bookie_2 because the holder's
            # index was built from the column names).  The original chained
            # bookie_df[bookie_1].iloc[index] = ... assigns through an
            # intermediate Series and is not guaranteed to reach the frame
            # (SettingWithCopy; silently lost under pandas copy-on-write).
            bookie_df.loc[bookie_2, bookie_1] = ttest[1]
    #Deletes repetitive values (upper triangle incl. diagonal) to improve heatmap visualization
    for i in range(bookie_df.shape[0]):
        for j in range(bookie_df.shape[1]):
            if j >= i:
                bookie_df.iloc[i, j] = None
#This generates a heatmap of p-values for home wins
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(home_win_df, vmin=0, vmax=.38, annot = True, ax = ax)
ax.set_title("Home Win one-to-one P-Values Across Bookies\n Alpha/2 = .025 (.05/2), Two Sided T-Test")
ax.patch.set_alpha(0.5)
ax.set_ylabel('')
ax.set_xlabel('')
#The following bookie relationship show t-test p-values that attempt to reject the null
#hypothesis that they come from the same distribution at a 95% level of confidence,
#most succeed, meaning the various bookie predictions are truly from different distributions.
#In the heatmap below the t-tests that successfully reject the null hypothesis are in dark blue.
#Lighter purple and magenta values denote failure to reject because the p-value is greater
#than .05 It should also be noted that these are individual comparisons between similar metrics
#across individual bookies, one to another. There is one darker purple square with a p-value of .038 which
#does reject the null hypothesis.
#The following 5 t-tests comparisons failed to reject, meaning they come from the same underlying distribution
#at a 95% statistical confidence level for probability of a home team win:
#B365H to WHH
#B365H to VCH
#BWH to LBH
#BWH to WHH
#IWH to LBH
#key:
#B365H - Bet365
#BWH - Bet and Win
#IWH - Interwetten
#LBH - Ladbrokes
#WHH - William Hill
#VCH - VC Bet
#"H" stands for home win
# notebook output: Text(0.5, 48.5, '')
#This generates a heatmap of p-values for away wins
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(away_win_df, vmin=0, vmax=.38, annot = True, ax = ax)
ax.set_title("Away Win one-to-one P-Values Across Bookies\n Alpha/2 = .025 (.05/2), Two Sided T-Test")
# NOTE(review): the home-win cell uses 0.5 here; .05 is likely a transposition - confirm intent
ax.patch.set_alpha(.05)
ax.set_ylabel('')
ax.set_xlabel('')
#The following bookie relationship show t-test p-values that attempt to reject the null
#hypothesis that they come from the same distribution at a 95% level of confidence,
#most succeed, meaning the various bookie predictions are truly from different distributions.
#In the heatmap below the t-tests that successfully reject the null hypothesis are in dark blue.
#Purple values denote failure to reject because the p-value is greater
#than .05 It should also be noted that these are individual comparisons between similar metrics
#across individual bookies, one to another.
#The following 2 t-tests comparisons failed to reject, meaning they come from the same underlying distribution
#at a 95% statistical confidence level for probability of an away team win:
#B365A to VCA
#LBA to WHA
#key:
#B365A - Bet365
#BWA - Bet and Win
#IWA - Interwetten
#LBA - Ladbrokes
#WHA - William Hill
#VCA - VC Bet
#"A" stands for away win
# notebook output: Text(0.5, 48.5, '')
#This generates a heatmap of p-values for draws
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(draw_df, vmin=0, vmax=.38, annot = True, ax = ax)
ax.set_title("Draw one-to-one P-Values Across Bookies\n Alpha = .025 (.05/2), Two Sided T-Test")
ax.patch.set_alpha(.05)
ax.set_ylabel('')
ax.set_xlabel('')
#The following bookie relationship show t-test p-values that attempt to reject the null
#hypothesis that they come from the same distribution at a 95% level of confidence,
#and all succeed, meaning the various bookie predictions are truly from different distributions.
#All ttests below reject the null hypothesis and thereby come from different underlying distributions
#key:
#B365D - Bet365
#BWD - Bet and Win
#IWD - Interwetten
#LBD - Ladbrokes
#WHD - William Hill
#VCD - VC Bet
#"D" stands for draw
# notebook output: Text(0.5, 48.5, '')
#Now we conduct one-way anova tests for the bookies in each category,
#home win, away win, and draw
result = stats.f_oneway(df.B365H, df.BWH, df.IWH, df.LBH, df.WHH, df.VCH)
print(' ')
print(f'The p-value for this one way ANOVA test is {result[1]}')
print('Therefore, the null hypothesis that these populations come from the same underlying distribution')
print(f"was {('rejected' if result[1]<=.05 else 'not rejected')} at the .95 level of confidence.")
print(' ')
result = stats.f_oneway(df.B365A, df.BWA, df.IWA, df.LBA, df.WHA, df.VCA)
print(f'The p-value for this one way ANOVA test is {result[1]}')
print('Therefore, the null hypothesis that these populations come from the same underlying distribution')
print(f"was {('rejected' if result[1]<=.05 else 'not rejected')} at the .95 level of confidence.")
print(' ')
# BUG FIX: the draw ANOVA's return value was discarded, so the away-win
# p-value was printed a second time (visible in the saved output below).
result = stats.f_oneway(df.B365D, df.BWD, df.IWD, df.LBD, df.WHD, df.VCD)
print(f'The p-value for this one way ANOVA test is {result[1]}')
print('Therefore, the null hypothesis that these populations come from the same underlying distribution')
print(f"was {('rejected' if result[1]<=.05 else 'not rejected')} at the .95 level of confidence.")
print(' ')
#For all three of my one-way ANOVA tests, the null hypothesis that the individual distributions
#come from the same underlying distribution was rejected at the 95% level of confidence.
#This means essentially there is no single common distribution that can predict all the bookie's
#behavior for all three categories of odds, home wins, away wins, and draws.
# notebook output: The p-value for this one way ANOVA test is 2.750142525557248e-22 Therefore, the null hypothesis that these populations come from the same underlying distrubtion was rejected at the .95 level of confindence. The p-value for this one way ANOVA test is 3.3042277037498044e-51 Therefore, the null hypothesis that these populations come from the same underlying distrubtion was rejected at the .95 level of confindence. The p-value for this one way ANOVA test is 3.3042277037498044e-51 Therefore, the null hypothesis that these populations come from the same underlying distrubtion was rejected at the .95 level of confindence.
#Two way anova testing: regress each bookie's odds on the other five bookies'
#odds and collect the ANOVA p-value of every regressor.
hold_list = []
df_list = [home_win_df, away_win_df, draw_df]
# BUG FIX: work on copies - the original aliasing (home_win_df_anova =
# home_win_df) silently overwrote the t-test result frames built above.
home_win_df_anova = home_win_df.copy()
away_win_df_anova = away_win_df.copy()
draw_df_anova = draw_df.copy()
anova_df_list = [home_win_df_anova, away_win_df_anova, draw_df_anova]
match_list = []  # records which source dataframe was used for each fit
for index, dfl_item in enumerate(df_list):
    for bookie in anova_df_list[index]:
        formula = f'{bookie} ~ ' + ' + '.join(anova_df_list[index].drop(bookie, axis=1).columns)
        # Away-win odds were never log transformed, so fit those on the raw df.
        # Identity check replaces the fragile str(frame) == str(frame) hack,
        # and ols (from statsmodels.formula.api, imported at the top) replaces
        # sm.ols - the later `import statsmodels.api as sm` rebound sm to a
        # module that has no lowercase `ols`.
        if dfl_item is away_win_df:
            match_list.append([index, "df"])
            model = ols(formula, df).fit()
        else:
            #switch commenting to choose between log transformed or not
            #model = ols(formula, df).fit()
            model = ols(formula, df_log_transformed).fit()
            match_list.append([index, "df_log_transformed"])
        aov_table = sm2.stats.anova_lm(model, typ=2)
        for bookie_2 in anova_df_list[index].drop(bookie, axis=1).columns:
            # single .loc write instead of chained .loc[bookie][bookie_2] = ...
            anova_df_list[index].loc[bookie, bookie_2] = aov_table.loc[bookie_2]['PR(>F)']
    #Deletes repetitive values to improve heatmap visualization
    for i in range(anova_df_list[index].shape[0]):
        for j in range(anova_df_list[index].shape[1]):
            if j >= i:
                anova_df_list[index].iloc[i, j] = None
home_win_df_anova = anova_df_list[0]
away_win_df_anova = anova_df_list[1]
draw_df_anova = anova_df_list[2]
#match_list
#This generates a heatmap for home win two way anova tests
#all distributions reject the null hypothesis when each is compared to the rest of the others
#this means they do not come from the same underlying distribution
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(home_win_df_anova, vmin=0, vmax=.38, annot = True, ax = ax)
ax.set_title("Home Win one-to-many Two Way Anova P-Values Across Bookies\n Alpha/2 = .025 (.05/2), two sided T-Test")
ax.set_ylabel('')
ax.set_xlabel('')
# notebook output: Text(0.5, 48.5, '')
#This generates a heatmap for away win two way anova tests
#all distributions reject the null hypothesis when each is compared to the rest of the others
#this means they do not come from the same underlying distribution
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(away_win_df_anova, vmin=0, vmax=.38, annot = True, ax = ax)
# title typo fixed: "Alpha = =" -> "Alpha ="
ax.set_title("Away Win one-to-many Two Way Anova P-Values Across Bookies\n Alpha = .025 (.05/2), two sided T-Test")
ax.set_ylabel('')
ax.set_xlabel('')
# notebook output: Text(0.5, 48.5, '')
#This generates a heatmap for draw two way anova test
#all distributions reject the null hypothesis when each is compared to the rest of the others
#this means they do not come from the same underlying distribution
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(draw_df_anova, vmin=0, vmax=.38, annot = True, ax = ax)
# title typo fixed: "Alpha = =" -> "Alpha ="
ax.set_title("Draw one-to-many Two Way Anova P-Values Across Bookies\n Alpha = .025 (.05/2), two sided T-Test")
ax.set_ylabel('')
ax.set_xlabel('')
# notebook output: Text(0.5, 48.5, '')
#Load up information from the Player_Attributes table
# NOTE(review): sql_query_to_df is not defined anywhere in this file -
# presumably a helper that wraps cursor execution into a DataFrame; confirm it
# is defined/imported elsewhere in the project.
ath_df = sql_query_to_df(c, """SELECT * FROM Player_Attributes""")
#Checking for null values - the saved run found 47301 (dropped just below);
#the original comment claiming "there are none" contradicted the output.
ath_df.isna().sum().sum()
# notebook output: 47301
ath_df.dropna(inplace=True)
# keep a single attribute row per player
ath_df.drop_duplicates(subset='player_fifa_api_id', inplace=True)
ath_df.drop_duplicates(subset='player_api_id', inplace=True)
#Here are the athletic abilities we will look at:
ath_list = ['acceleration','sprint_speed','agility','reactions',
            'balance', 'jumping', 'stamina', 'strength']
#here are the soccer skills we will look at:
# everything between 'potential' and 'sliding_tackle' minus the athletic
# measures above, the categorical columns, and a few hand-picked exclusions
skills_list = ath_df.loc[:,'potential': 'sliding_tackle'].drop(['acceleration','sprint_speed','agility','reactions',
                         'balance', 'jumping', 'stamina', 'strength','preferred_foot',
                         'attacking_work_rate', 'defensive_work_rate','vision','potential','shot_power','volleys','curve'], axis=1).columns
#instantiate a dataframe with zeros to hold pvalues
# NOTE(review): wrapping the lists ([skills_list], [ath_list]) makes pandas
# build MultiIndex axes; downstream access is positional (iloc) so it works,
# but plain Index(skills_list)/Index(ath_list) would be simpler - left as-is.
qualities_df = pd.DataFrame(index=[skills_list], columns=[ath_list])
for col in qualities_df.columns:
    qualities_df[col].values[:] = 0
# check whether a log transform would help normality of the skill columns
for item in log_transform_test(ath_df[skills_list]):
    print(item)
#False indicates a log transformation will not improve normality
#We decided not to log transform due to 10/16 of these column tests resulting in false
# notebook output: ['crossing', False] ['finishing', True] ['heading_accuracy', False] ['short_passing', False] ['dribbling', False] ['free_kick_accuracy', False] ['long_passing', False] ['ball_control', False] ['long_shots', False] ['aggression', False] ['interceptions', True] ['positioning', False] ['penalties', False] ['marking', True] ['standing_tackle', True] ['sliding_tackle', True]
for item in log_transform_test(ath_df[ath_list]):
    print(item)
# notebook output: ['acceleration', False] ['sprint_speed', False] ['agility', False] ['reactions', False] ['balance', False] ['jumping', False] ['stamina', False] ['strength', False]
#This will populate two df's, one of pvalues and one of the difference between means
#function takes in the main_df, a p-values holder df, the percentage desired for split (must be 75, 50, or 25),
#a list of athletic measures and a list of osccer skills
def ath_to_skill_ttest(ath_df, qualities_df, top_percent, ath_list, skills_list):
    """Split players on each athletic measure and t-test each skill across the split.

    For every athletic measure in *ath_list*, players are divided into a top
    group (measure >= its *top_percent* percentile) and a bottom group, then a
    two-sided independent t-test compares each skill in *skills_list* between
    the groups.

    Parameters
    ----------
    ath_df : DataFrame of player attributes.  MUTATED: boolean split columns
        (e.g. 'agility_top75%') are added to it.
    qualities_df : skills x measures DataFrame used as the holder/template.
        MUTATED: filled with p-values.
    top_percent : int - must be 75, 50 or 25 so that str(top_percent)+'%' is a
        row of Series.describe().
    ath_list, skills_list : column-name lists of measures and skills.

    Returns
    -------
    (pvalues_df, mean_df, reject_list, fail_reject_list)
        pvalues_df : p-values (0.0 substituted when p < 1e-7), returned as an
            independent copy - the original returned qualities_df itself, so
            every previously returned frame mutated on each re-run (the issue
            worked around at the call sites with .copy()).
        mean_df : difference of means (top group - bottom group).
        reject_list / fail_reject_list : [measure, skill, top_values,
            bottom_values] entries split on alpha/2 = .025.
    """
    dict_means = {}
    # reset the holder to zero
    for col in qualities_df.columns:
        qualities_df[col].values[:] = 0
    # keys for Series.describe(), e.g. '75%' and '25%'
    top_percent_string = str(top_percent) + '%'
    bottom_percent_string = str(100 - top_percent) + '%'
    # boolean columns classifying players into upper and lower groups
    for ath in ath_list:
        cutoff = ath_df[ath].describe()[top_percent_string]
        ath_df[ath + '_top' + top_percent_string] = ath_df[ath] >= cutoff
        ath_df[ath + '_bottom' + bottom_percent_string] = ath_df[ath] < cutoff
    # per (measure, skill): [top-group skill values, bottom-group skill values]
    dict_skill_split = {}
    for ath in ath_list:
        for skill in skills_list:
            # single .loc lookup instead of the original chained .loc[mask][skill]
            top_vals = ath_df.loc[ath_df[ath + '_top' + top_percent_string], skill]
            bottom_vals = ath_df.loc[ath_df[ath + '_bottom' + bottom_percent_string], skill]
            dict_skill_split[ath + skill] = [top_vals, bottom_vals]
            dict_means[ath + skill] = top_vals.mean() - bottom_vals.mean()
    # one-to-one t-tests
    fail_reject_list = []
    reject_list = []
    mean_df = qualities_df.copy()
    for a, ath in enumerate(ath_list):
        for s, skill in enumerate(skills_list):
            x = dict_skill_split[ath + skill][0]  # skill values, top group
            y = dict_skill_split[ath + skill][1]  # skill values, bottom group
            ttest = stats.ttest_ind(x, y)  # two-sided t-test
            # clamp vanishingly small p-values to zero for display
            if ttest[1] < 1.0e-7:
                qualities_df.iloc[s, a] = 0.0
            else:
                qualities_df.iloc[s, a] = ttest[1]
            mean_df.iloc[s, a] = dict_means[ath + skill]
            # classify against alpha/2 = .025
            if ttest[1] > .025:
                fail_reject_list.append([ath, skill, x, y])
            else:
                reject_list.append([ath, skill, x, y])
    # BUG FIX: return a copy so the result is decoupled from the shared holder
    pvalues_df = qualities_df.copy()
    return pvalues_df, mean_df, reject_list, fail_reject_list
# Run the split/t-test analysis at 75/25, 50/50 and 25/75 splits.
pvalues_df_7525, mean_df_7525, reject_list_7525, fail_reject_list_7525 = ath_to_skill_ttest(ath_df, qualities_df, 75, ath_list, skills_list)
#I was having issues with any earlier dataframe that had been assigned by this function getting reassigned
#every time I ran it so for now I have solved it by forcing pandas to make copies
# (root cause: the function returns the shared qualities_df holder itself)
df_7525 = pvalues_df_7525.copy()
mdf_7525 = mean_df_7525.copy()
pvalues_df_5050, mean_df_5050, reject_list_5050, fail_reject_list_5050 = ath_to_skill_ttest(ath_df, qualities_df, 50, ath_list, skills_list)
df_5050 = pvalues_df_5050.copy()
mdf_5050 = mean_df_5050.copy()
pvalues_df_2575, mean_df_2575, reject_list_2575, fail_reject_list_2575 = ath_to_skill_ttest(ath_df, qualities_df, 25, ath_list, skills_list)
df_2575 = pvalues_df_2575.copy()
mdf_2575 = mean_df_2575.copy()
#converting values from objects to floats so they can be plotted
df_list = [df_7525, df_5050, df_2575, mdf_7525, mdf_5050, mdf_2575]
for frame in df_list:
    for col in frame.columns:
        frame[col] = pd.to_numeric(frame[col])
# 75/25 Split Seaborn Heatmap: p-values (left) and mean differences (right)
sns.set(font_scale=1.3)
fig, (ax1, ax2) = plt.subplots(1,2,figsize= (15,20))
sns.heatmap(df_7525, vmin=0, vmax=.53, annot = True, ax = ax1, square = True, annot_kws={"fontsize":11}, cbar= False)
sns.heatmap(mdf_7525, vmin=-10, vmax=10, annot = True, cmap='Blues', ax = ax2, square = True, annot_kws={"fontsize":11}, cbar =False)
#ax.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = False, labeltop=True)
# NOTE(review): Tick.label is deprecated in newer matplotlib (use tick.label1) - confirm against the pinned version
for tick in ax1.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
    tick.label.set_rotation(50)
for tick in ax1.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax2.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
    tick.label.set_rotation(50)
# Create offset transform shifting x tick labels 15 points to the left
dx = -15/72.; dy = 0/72.
offset = matplotlib.transforms.ScaledTranslation(dx, dy, fig.dpi_scale_trans)
# apply offset transform to all x ticklabels.
for label in ax1.xaxis.get_majorticklabels():
    label.set_transform(label.get_transform() + offset)
for label in ax2.xaxis.get_majorticklabels():
    label.set_transform(label.get_transform() + offset)
ax1.set_title("75/25 Split on Athletic Measures for Each Skill\n Alpha = .05, Two Sided T-Test P-Values")
ax2.set_title("75/25 Split Difference Between Means")
ax1.set_ylabel('Skill')
ax2.set_ylabel(' ')
ax1.set_xlabel('Athletic Measure')
ax2.set_xlabel('Athletic Measure')
fig.tight_layout()
#https://stackoverflow.com/questions/28615887/how-to-move-a-ticks-label-in-matplotlib
# Distplots of four (measure, skill) pairs where the 75/25 split FAILED to
# reject the null hypothesis (original header comment was copy-pasted from the
# bookie cells and did not describe this plot).
sns.set(font_scale=1.3)
sns.set(style="whitegrid", palette="dark", color_codes=True)
fig, ax = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(16,12)
fig.suptitle("4 Examples that Failed to Reject 75/25 Split \n (lower grouping black)", fontsize=14)
i = 0
p = fail_reject_list_7525[:4]  # only four 2x2 subplot slots
for q in p:
    # q = [measure, skill, top-group skill values, bottom-group skill values]
    q[3].name = " "  # blank the series name so it doesn't label the x axis
    sns.distplot(q[2], ax = ax[int(i/2)][i%2]).set_title(f" ")
    sns.distplot(q[3], color="black", ax = ax[int(i/2)][i%2]).set_title(f"75/25 Split of {q[0].title()} viewed on {q[1].title()}\n Difference Between Means of {abs(round((round(q[2].mean()/q[3].mean(),2)-1)*100,2))}%")
    i+=1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
# Distplots of (measure, skill) pairs where the 75/25 split rejected the null
# AND the lower group's mean is at least 16% higher than the upper group's.
sns.set(style="whitegrid", palette="dark", color_codes=True)
fig, ax = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(16,12)
fig.suptitle("Reject 75/25 Split AND Lower Group (green) has at least 16% Higher Mean (highest 4 mean differences)", fontsize=14)
i = 0
# NOTE(review): no [:4] guard here - a fifth match would overflow the 2x2 grid;
# the 1.16 threshold is presumably tuned to yield exactly four pairs.
for q in reject_list_7525:
    if q[3].mean()/q[2].mean() >= 1.16 :
        q[3].name = " "  # blank the series name so it doesn't label the x axis
        sns.distplot(q[2], ax = ax[int(i/2)][i%2]).set_title(f" ")
        sns.distplot(q[3], color="green", ax = ax[int(i/2)][i%2]).set_title(f"75/25 Split of {q[0].title()}\n Difference Between Means of {abs(round((round(q[2].mean()/q[3].mean(),2)-1)*100,2))}%")
        i+=1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
# Distplots of (measure, skill) pairs where the 75/25 split rejected the null
# AND the upper group's mean is at least 34% higher.
sns.set(style="whitegrid", palette="husl", color_codes=True)
fig, ax = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(16,12)
# NOTE(review): title says "pink" but the second distplot is drawn in grey
fig.suptitle("Reject 75/25 Split AND Upper Group (pink) has at least 34% Greater Mean (top 4 highest mean differences)", fontsize=14)
i = 0
# NOTE(review): no [:4] guard - the 1.34 threshold presumably yields exactly four pairs
for q in reject_list_7525:
    if q[2].mean()/q[3].mean() >= 1.34:
        q[3].name = " "  # blank the series name so it doesn't label the x axis
        sns.distplot(q[2], ax = ax[int(i/2)][i%2]).set_title(f" ")
        sns.distplot(q[3], color="grey", ax = ax[int(i/2)][i%2]).set_title(f"75/25 Split of {q[0].title()} viewed on {q[1].title()}\n Difference Between Means of {round((round(q[2].mean()/q[3].mean(),2)-1)*100,2)}%")
        i+=1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
#50/50 Split Seaborn Heatmap: p-values (left) and mean differences (right)
sns.set(font_scale=1.3)
fig, (ax1, ax2) = plt.subplots(1,2,figsize= (15,20))
sns.heatmap(df_5050, vmin=0, vmax=.53, annot = True, ax = ax1, square = True, annot_kws={"fontsize":11}, cbar= False)
sns.heatmap(mdf_5050, vmin=-10, vmax=10, annot = True, cmap='Blues', ax = ax2, square = True, annot_kws={"fontsize":11}, cbar =False)
#ax.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = False, labeltop=True)
# NOTE(review): Tick.label is deprecated in newer matplotlib (use tick.label1)
for tick in ax1.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
    tick.label.set_rotation(50)
for tick in ax1.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax2.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
    tick.label.set_rotation(50)
# Create offset transform shifting x tick labels 15 points to the left
dx = -15/72.; dy = 0/72.
offset = matplotlib.transforms.ScaledTranslation(dx, dy, fig.dpi_scale_trans)
# apply offset transform to all x ticklabels.
for label in ax1.xaxis.get_majorticklabels():
    label.set_transform(label.get_transform() + offset)
for label in ax2.xaxis.get_majorticklabels():
    label.set_transform(label.get_transform() + offset)
ax1.set_title("50/50 Split on Athletic Measures for Each Skill\n Alpha = .05, Two Sided T-Test P-Values")
ax2.set_title("50/50 Split Difference Between Means")
ax1.set_ylabel('Skill')
ax2.set_ylabel(' ')
ax1.set_xlabel('Athletic Measure')
ax2.set_xlabel('Athletic Measure')
fig.tight_layout()
# Distplots of four (measure, skill) pairs where the 50/50 split FAILED to
# reject the null hypothesis (original header was copy-pasted from the bookie cells).
sns.set(style="whitegrid", palette="dark", color_codes=True)
fig, ax = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(16,12)
fig.suptitle("Four Examples that Failed to Reject 50/50 Split \n (lower group black)", fontsize=14)
i = 0
p = fail_reject_list_5050[:4]  # only four 2x2 subplot slots
for q in p:
    q[3].name = " "  # blank the series name so it doesn't label the x axis
    # NOTE(review): label="sdf" looks like leftover debug text; the title is overwritten below anyway
    sns.distplot(q[2], label = "sdf", ax = ax[int(i/2)][i%2]).set_title(f"Upper 50% of {q[0].title()} on {q[1].title()}")
    sns.distplot(q[3], color = "black", ax = ax[int(i/2)][i%2]).set_title(f"50/50 Split of {q[0].title()} viewed on {q[1].title()}\n Difference Between Means of {abs(round((round(q[2].mean()/q[3].mean(),2)-1)*100,2))}%")
    i+=1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
# Distplots of (measure, skill) pairs where the 50/50 split rejected the null
# AND the lower group's mean is at least 8% higher.
sns.set(style="whitegrid", palette="dark", color_codes=True)
fig, ax = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(16,12)
fig.suptitle("Reject 50/50 Split AND Lower Group (green) has at least 8% Higher Mean (highest 4 mean differences)", fontsize=14)
i = 0
# NOTE(review): no [:4] guard - the 1.08 threshold presumably yields exactly four pairs
for q in reject_list_5050:
    if q[3].mean()/q[2].mean() >= 1.08:
        q[3].name = " "  # blank the series name so it doesn't label the x axis
        sns.distplot(q[2], ax = ax[int(i/2)][i%2]).set_title(f" ")
        sns.distplot(q[3], color="green", ax = ax[int(i/2)][i%2]).set_title(f"50/50 Split of {q[0].title()} viewed on {q[1].title()}\n Difference Between Means of {abs(round((round(q[2].mean()/q[3].mean(),2)-1)*100,2))}%")
        i+=1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
# Distplots of (measure, skill) pairs where the 50/50 split rejected the null
# AND the upper group's mean is at least 41% higher.
sns.set(style="whitegrid", palette="husl", color_codes=True)
fig, ax = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(16,12)
# NOTE(review): title says "pink" but the second distplot is drawn in grey
fig.suptitle("Reject 50/50 Split AND Upper Group (pink) has at least 41% Greater Mean (top 4 highest mean differences)", fontsize=14)
i = 0
# NOTE(review): no [:4] guard - the 1.41 threshold presumably yields exactly four pairs
for q in reject_list_5050:
    if q[2].mean()/q[3].mean() >= 1.41:
        q[3].name = " "  # blank the series name so it doesn't label the x axis
        sns.distplot(q[2], ax = ax[int(i/2)][i%2]).set_title(f" ")
        sns.distplot(q[3], color="grey", ax = ax[int(i/2)][i%2]).set_title(f"50/50 Split of {q[0].title()} viewed on {q[1].title()}\n Difference Between Means of {round((round(q[2].mean()/q[3].mean(),2)-1)*100,2)}%")
        i+=1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
#25/75 Split Seaborn Heatmap: p-values (left) and mean differences (right)
sns.set(font_scale=1.3)
fig, (ax1, ax2) = plt.subplots(1,2,figsize= (15,20))
sns.heatmap(df_2575, vmin=0, vmax=.53, annot = True, ax = ax1, square = True, annot_kws={"fontsize":11}, cbar= False)
sns.heatmap(mdf_2575, vmin=-10, vmax=10, annot = True, cmap='Blues', ax = ax2, square = True, annot_kws={"fontsize":11}, cbar =False)
#ax.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = False, labeltop=True)
# NOTE(review): Tick.label is deprecated in newer matplotlib (use tick.label1)
for tick in ax1.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
    tick.label.set_rotation(50)
for tick in ax1.yaxis.get_major_ticks():
    tick.label.set_fontsize(14)
for tick in ax2.xaxis.get_major_ticks():
    tick.label.set_fontsize(14)
    tick.label.set_rotation(50)
# Create offset transform shifting x tick labels 15 points to the left
dx = -15/72.; dy = 0/72.
offset = matplotlib.transforms.ScaledTranslation(dx, dy, fig.dpi_scale_trans)
# apply offset transform to all x ticklabels.
for label in ax1.xaxis.get_majorticklabels():
    label.set_transform(label.get_transform() + offset)
for label in ax2.xaxis.get_majorticklabels():
    label.set_transform(label.get_transform() + offset)
ax1.set_title("25/75 Split on Athletic Measures for Each Skill\n Alpha = .05, Two Sided T-Test P-Values")
ax2.set_title("25/75 Split Difference Between Means")
ax1.set_ylabel('Skill')
ax2.set_ylabel(' ')
ax1.set_xlabel('Athletic Measure')
ax2.set_xlabel('Athletic Measure')
fig.tight_layout()
#Distplots of each bookie across the three categories of prediction AFTER log transformation ON -home wins and draw
#these plots visually appear fairly normal - though further testing for normality could still be done
sns.set(style="whitegrid", palette="dark", color_codes=True)
fig, ax = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(16, 12)
fig.suptitle("Four examples of failed to Reject 25/75 Split", fontsize=14)
for panel, entry in enumerate(fail_reject_list_2575):
    upper, lower = entry[2], entry[3]
    lower.name = " "
    row, col = divmod(panel, 2)
    sns.distplot(upper, ax=ax[row][col]).set_title(f" ")
    sns.distplot(lower, color="black", ax=ax[row][col]).set_title(
        f"25/75 Split of {entry[0].title()} viewed on {entry[1].title()}\n Difference Between Means of {abs(round((round(upper.mean()/lower.mean(),2)-1)*100,2))}%"
    )
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
#Distplots of each bookie across the three categories of prediction AFTER log transformation ON -home wins and draw
#these plots visually appear fairly normal - though further testing for normality could still be done
sns.set(style="whitegrid", palette="dark", color_codes=True)
fig, ax = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(16, 12)
fig.suptitle("Reject 25/75 Split AND Lower Group (green) has at least 6.5% Higher Mean (top 4 highest mean differences)", fontsize=14)
panel = 0
for entry in reject_list_2575:
    upper, lower = entry[2], entry[3]
    if lower.mean() / upper.mean() >= 1.065:
        lower.name = " "
        row, col = divmod(panel, 2)
        sns.distplot(upper, ax=ax[row][col]).set_title(f" ")
        sns.distplot(lower, color="green", ax=ax[row][col]).set_title(
            f"25/75 Split of {entry[0].title()} viewed on {entry[1].title()}\n Difference Between Means of {abs(round((round(upper.mean()/lower.mean(),2)-1)*100,2))}%"
        )
        panel += 1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
#Distplots of each bookie across the three categories of prediction AFTER log transformation ON -home wins and draw
#these plots visually appear fairly normal - though further testing for normality could still be done
sns.set(style="whitegrid", palette="husl", color_codes=True)
fig, ax = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(16, 12)
fig.suptitle("Reject 25/75 Split AND Upper Group (pink) has at least 61% Greater Mean (top 4 highest mean differences)", fontsize=14)
panel = 0
for entry in reject_list_2575:
    upper, lower = entry[2], entry[3]
    if upper.mean() / lower.mean() >= 1.61:
        lower.name = " "
        row, col = divmod(panel, 2)
        sns.distplot(upper, ax=ax[row][col]).set_title(f" ")
        sns.distplot(lower, color="grey", ax=ax[row][col]).set_title(
            f"25/75 Split of {entry[0].title()} viewed on {entry[1].title()}\n Difference Between Means of {abs(round((round(upper.mean()/lower.mean(),2)-1)*100,2))}%"
        )
        panel += 1
fig.tight_layout(pad=4.0, w_pad=0.5, h_pad=1)
#code from Harrison
#Works consulted:
#https://machinelearningmastery.com/how-to-code-the-students-t-test-from-scratch-in-python/
#https://www.sagepub.com/sites/default/files/upm-binaries/33663_Chapter4.pdf
#https://www.kaggle.com/efezinoerome/analyzing-soccer-data
#http://www.statstutor.ac.uk/resources/uploaded/tutorsquickguidetostatistics.pdf
#https://math.stackexchange.com/questions/2173385/semantics-binomial-vs-binary
# https://towardsdatascience.com/hypothesis-testing-in-the-northwind-dataset-using-anova-db3ab16b5eba
# https://www.quora.com/What-does-a-high-F-value-usually-mean-and-why
def connect_to_sql_database(database_file_name):
    """Open a SQLite database file and return (connection, cursor)."""
    import sqlite3
    connection = sqlite3.connect(database_file_name)
    cursor = connection.cursor()
    return connection, cursor

# conn and c already exist from the top of the notebook, so the reconnect
# below stays disabled:
# conn, c = connect_to_sql_database('database1.sqlite')

def sql_query_to_df(cursor, sql_query):
    """Execute sql_query on cursor and return the rows as a DataFrame
    whose columns are named from the cursor description.

    Fixes vs. the original (which was commented out even though it is
    called below, causing a NameError): uses the passed-in cursor rather
    than the global `c`, and names the columns at construction time so an
    empty result set still yields a correctly-shaped empty DataFrame.
    """
    import pandas as pd
    cursor.execute(sql_query)
    rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
#Load up information from Match and Player_Attributes Databases
#NOTE(review): sql_query_to_df only appears in a commented-out cell above —
#these calls NameError unless that helper is restored; confirm execution order.
df2 = sql_query_to_df(c, """SELECT * FROM Match""")
#I need to utilize the away_team_api_id column, so I reload the Match table here
print(df2.columns)
df1 = sql_query_to_df(c, """SELECT * FROM Player_Attributes""")
print(df1.columns)
Index(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
'match_api_id', 'home_team_api_id', 'away_team_api_id',
'home_team_goal',
...
'SJA', 'VCH', 'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA'],
dtype='object', length=115)
Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
'potential', 'preferred_foot', 'attacking_work_rate',
'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
'gk_reflexes'],
dtype='object')
#Boolean outcome columns on the full Match table
df2['HomeWin']=df2.home_team_goal>df2.away_team_goal
df2['AwayWin']=df2.away_team_goal>df2.home_team_goal
#As shown below, the effect size of playing home/away appears to be substantial:
#Percentage of wins at home (across the dataset)
#Fix: the denominators previously came from the odds DataFrame `df` rather
#than from df2 itself; mean() of the boolean column uses one frame throughout.
print(df2.HomeWin.mean())
#Percentage of wins across the dataset when away
print(df2.AwayWin.mean())
#This will be confirmed with statistical testing.
0.53125 0.3328281027104137
def homewinbinary(df):
    """Per-team home-win percentages.

    Parameters
    ----------
    df : DataFrame with a `home_team_api_id` column and a boolean `HomeWin`
        column (one row per match).

    Returns
    -------
    DataFrame indexed by home-team id: column 0 is the team's home-win
    percentage, column 1 its complement (all other outcomes).

    Fixes vs. the original: it grouped the *global* df2 instead of its df
    argument, and built an away-games groupby whose get_group(team) could
    KeyError for a team that never played away — even though that result
    was never used.
    """
    win_dict = {}
    games_home = df.groupby(df.home_team_api_id)  # slice by home id
    for team in games_home.groups.keys():  # each individual team id
        home_games = games_home.get_group(team)
        # home-win percentage and its complement
        home_per = home_games.HomeWin.sum() / len(home_games.HomeWin)
        win_dict[team] = [home_per, 1 - home_per]
    # transpose so teams are rows and [home_per, else_per] become columns 0/1
    return pd.DataFrame(win_dict).T
#Build the per-team home-win / other-outcome percentage table from Match data
binary = homewinbinary(df2)
#peek at the first few teams
binary.head()
| 0 | 1 | |
|---|---|---|
| 1601 | 0.450000 | 0.550000 |
| 1773 | 0.355556 | 0.644444 |
| 1957 | 0.525000 | 0.475000 |
| 2033 | 0.253333 | 0.746667 |
| 2182 | 0.616667 | 0.383333 |
#Plotting the distributions for Home Win Binary metric
plt.figure(figsize=(8,5))
for skill in binary.columns:
    sns.distplot((binary[skill]), hist_kws=dict(alpha=0.2))
plt.xlabel('Home Wins (Orange) versus all other outcomes (Blue)')
#Run a dependent Ttest with Stats Model
#NOTE(review): homewinbinary stores [home_per, else_per], so column 0 is the
#home-win percentage and column 1 its complement — the original inline labels
#were swapped; confirm downstream interpretation (e.g. Cohen_d(y, x)).
x = binary[0] #home-win percentage per team
y = binary[1] #complement: all other outcomes per team
ttest=stats.ttest_rel(x,y)
print(' ')
print(f'The p-value for this dependent T-test is {ttest[1]}')
print('Therefore, the null hypothesis that playing at home does not have a statistically significant effect on winning')
print(f"was {('rejected' if ttest[1]<=.05 else 'not rejected')} at the .95 level of confidence.")
print(' ')
The p-value for this dependent T-test is 9.70767116964689e-25 Therefore, the null hypothesis that playing at home does not have a statistically significant effect on winning was rejected at the .95 level of confidence.
def Cohen_d(group1, group2):
    """Return Cohen's d effect size between two independent samples.

    Positive when group1's mean exceeds group2's. Uses the standard pooled
    variance ((n1-1)*var1 + (n2-1)*var2) / (n1 + n2 - 2), matching the
    Bessel-corrected sample variances; the previous version weighted by
    n1/n2 over (n1+n2), which biases d for small samples.
    """
    diff = group1.mean() - group2.mean()
    n1, n2 = len(group1), len(group2)
    var1 = group1.var()
    var2 = group2.var()
    # Pooled variance with (n-1) weights to match the ddof=1 variances above
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    # Cohen's d statistic
    d = diff / np.sqrt(pooled_var)
    return d
#The two groups under investigation here have given a Cohen's D greater than 0.8.
#Therefore, playing a game at home is considered to have large effect size on your odds of victory
#(d > 0.8 is the conventional "large" threshold; x and y are the two columns
# of `binary` assigned in the cell above)
Cohen_d(y, x)
1.3022709371161278
#Correlation heatmap of player attributes
numeric_df1 = df1.select_dtypes(include=['float64', 'int64'])
corr2 = numeric_df1.loc[:, numeric_df1.columns[3:]].corr()
#Blank out the diagonal and upper triangle so each pair appears only once
corr2 = corr2.where(np.tril(np.ones(corr2.shape, dtype=bool), k=-1))
#Heatmap of Pearson Correlation Coefficients for all Player Attributes
fig2, ax2 = plt.subplots(nrows=1, ncols=1)
fig2.set_size_inches(w=24, h=24)
sns.heatmap(corr2, annot=True, linewidths=0.5, ax=ax2)
<matplotlib.axes._subplots.AxesSubplot at 0x129b1acf8>
#Distribution Plots by Variable
#5x7 grid of distplots, one per float column (skipping the first three
#float columns) of the NaN-free copy of df1
fig6, ax6 = plt.subplots(nrows=5,ncols=7)
fig6.set_size_inches(16,12)
#drop all rows containing any missing value before plotting
foo=df1.dropna()
a=foo.select_dtypes(include=['float64']).columns[3:].tolist()
i=0
for j in a:
    b = foo[j]
    #plot i goes to row i//7, column i%7
    sns.distplot(b, ax = ax6[int(i/7)][i%7])
    i=i+1
fig6.tight_layout()
# Check for missing values in the five goal-keeper metric columns
df1[['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']].isna().sum()
gk_diving 836 gk_handling 836 gk_kicking 836 gk_positioning 836 gk_reflexes 836 dtype: int64
# Get rid of missing values (keep only the five goal-keeper columns)
df1c = df1[['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']].dropna()
#Check again — every count should now be zero
df1c[['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']].isna().sum()
gk_diving 0 gk_handling 0 gk_kicking 0 gk_positioning 0 gk_reflexes 0 dtype: int64
#ANOVA performed iteratively across three positively-correlated metrics
#(the bare `sm1.api.formula.ols` expression on its own line was a no-op and
# has been removed; `ols` is already imported from statsmodels.formula.api)
#I repeat my data cleaning technique from before
df1test1 = df1d[['strength', 'sprint_speed', 'acceleration']]
df1test1 = df1test1.dropna()
for col in df1test1:
    # regress each metric on the other two and print the type-II ANOVA table
    formula = f'{col} ~ ' + ' + '.join([column for column in df1test1.drop(col, axis=1).columns])
    print(formula)
    model = ols(formula, df1test1).fit()
    aov_table = sm.stats.anova_lm(model, typ=2)
    print(aov_table)
strength ~ sprint_speed + acceleration
sum_sq df F PR(>F)
sprint_speed 9.251548e+05 1.0 6880.867712 0.0
acceleration 1.757729e+06 1.0 13073.161155 0.0
Residual 2.462363e+07 183139.0 NaN NaN
sprint_speed ~ strength + acceleration
sum_sq df F PR(>F)
strength 1.905996e+05 1.0 6880.867712 0.0
acceleration 2.352731e+07 1.0 849363.366244 0.0
Residual 5.072939e+06 183139.0 NaN NaN
acceleration ~ strength + sprint_speed
sum_sq df F PR(>F)
strength 3.741563e+05 1.0 13073.161155 0.0
sprint_speed 2.430894e+07 1.0 849363.366244 0.0
Residual 5.241473e+06 183139.0 NaN NaN
#ANOVA performed iteratively across the three most negatively correlated metrics
#(uses the directly-imported `ols` instead of the no-op sm1.api.formula.ols path)
df1test2 = df1d[['gk_diving', 'gk_reflexes', 'ball_control']]
df1test2 = df1test2.dropna()
for col in df1test2:
    # regress each metric on the other two and print the type-II ANOVA table
    formula = f'{col} ~ ' + ' + '.join([column for column in df1test2.drop(col, axis=1).columns])
    print(formula)
    model = ols(formula, df1test2).fit()
    aov_table = sm.stats.anova_lm(model, typ=2)
    print(aov_table)
gk_diving ~ gk_reflexes + ball_control
sum_sq df F PR(>F)
gk_reflexes 1.725937e+07 1.0 506868.848760 0.0
ball_control 2.956282e+05 1.0 8681.933974 0.0
Residual 6.236059e+06 183139.0 NaN NaN
gk_reflexes ~ gk_diving + ball_control
sum_sq df F PR(>F)
gk_diving 1.812554e+07 1.0 506868.848760 0.0
ball_control 2.428963e+05 1.0 6792.434757 0.0
Residual 6.549020e+06 183139.0 NaN NaN
ball_control ~ gk_diving + gk_reflexes
sum_sq df F PR(>F)
gk_diving 8.719733e+05 1.0 8681.933974 0.0
gk_reflexes 6.822007e+05 1.0 6792.434757 0.0
Residual 1.839363e+07 183139.0 NaN NaN
#Although high F-values will lead to low P-values and generally indicate a good predictor of the response,
#these F-values are so large as to reduce the P-value to 0, and don't offer insight here.
#Each separate Goalie Metric before it was log-transformed
#(pairwise scatter plots with marginal distributions of the cleaned frame)
sns.pairplot(df1c)
<seaborn.axisgrid.PairGrid at 0x1c5df21cc0>
# Log-transformation of the goal-keeper metrics
df1c_log = df1c.copy()
# Fix: transform the cleaned frame df1c rather than the raw df1 — the raw
# frame still contains NaN rows, and the old code only produced the right
# result through pandas index alignment.
df1c_log[['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']] = \
    np.log(df1c[['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']])
#Each separate Goalie Metric log-transformed
sns.pairplot(df1c_log)
<seaborn.axisgrid.PairGrid at 0x122d99048>
#Pair each Goal-keeping metric with each other one in an independent Ttest,
#keyed "metric_a and metric_b" (includes self-pairs, like the original loop)
ttest_result_dict = {
    f"{first} and {second}": stats.ttest_ind(df1c[first], df1c[second])
    for first in df1c.columns
    for second in df1c.columns
}
#Pair each log-transformed Goal-keeping metric with each other one in an
#independent Ttest, keyed "metric_a and metric_b" (includes self-pairs)
ttest_result_dict1 = {
    f"{first} and {second}": stats.ttest_ind(df1c_log[first], df1c_log[second])
    for first in df1c_log.columns
    for second in df1c_log.columns
}
# Create list of statistical values (test statistic, p-value) tuples
ttest_list = list(ttest_result_dict.values())
ttest_list1 = list(ttest_result_dict1.values())
#Created a list of only p-values (index 1 of each ttest result)
p = [ttest_list[i][1] for i in range(len(ttest_list))]
p1 = [ttest_list1[i][1] for i in range(len(ttest_list1))]
#Sliced up list into appropriate sub-lists for each column
p_diving, p_handling, p_kicking, p_positioning, p_reflexes = p[0:5], p[5:10], p[10:15], p[15:20], p[20:25]
p_vals = [p_diving, p_handling, p_kicking, p_positioning, p_reflexes]
#NOTE: the five names are deliberately re-bound here to the log-transformed
#slices; p_vals has already captured the raw slices above, so nothing is lost
p_diving, p_handling, p_kicking, p_positioning, p_reflexes = p1[0:5], p1[5:10], p1[10:15], p1[15:20], p1[20:25]
p_valslog = [p_diving, p_handling, p_kicking, p_positioning, p_reflexes]
# Create empty dataframe out of goal-keeper metrics: drop every row, keep the
# five gk_* columns, then use those column names as the index so the frame
# becomes a 5x5 shell for the pairwise p-values.
# Fix: the original sliced by foo2.shape[0] (an unrelated frame, presumably a
# stale copy of df1c); use df1c's own length, mirroring the log version below.
goalie_df = df1c.loc[:,'gk_diving':'gk_reflexes'].drop(df1c.index[0:df1c.shape[0]])
goalie_df['Index_'] = df1c.loc[:,'gk_diving':'gk_reflexes'].columns
goalie_df = goalie_df.set_index('Index_')
None
# Create empty dataframe out of log-transformed goal-keeper metrics:
# drop every row, keep the five gk_* columns, then use those column names as
# the index so the frame becomes a 5x5 shell for the pairwise p-values
goalie_dflog = df1c_log.loc[:,'gk_diving':'gk_reflexes'].drop(df1c_log.index[0:df1c_log.shape[0]])
goalie_dflog['Index_'] = df1c_log.loc[:,'gk_diving':'gk_reflexes'].columns
goalie_dflog = goalie_dflog.set_index('Index_')
#suppress notebook display of the intermediate frame
None
#Set columns for normal and log-transformed equal to respective p_values:
#each column of the 5x5 shells receives the matching sub-list of p-values
for column, column_p_values in zip(goalie_df.columns, p_vals):
    goalie_df[column] = column_p_values
#repeat for log values
for column, column_p_values in zip(goalie_dflog.columns, p_valslog):
    goalie_dflog[column] = column_p_values
#Delete repetitive values to improve heatmap visualization: keep only the
#strict lower triangle, everything else (diagonal included) becomes NaN
goalie_df = goalie_df.where(np.tril(np.ones(goalie_df.shape, dtype=bool), k=-1))
goalie_df
#Delete repetitive values to improve heatmap visualization: keep only the
#strict lower triangle, everything else (diagonal included) becomes NaN
goalie_dflog = goalie_dflog.where(np.tril(np.ones(goalie_dflog.shape, dtype=bool), k=-1))
goalie_dflog
#Generate a heatmap of p-values
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(goalie_df, vmin=0, vmax=.38, annot = True, ax = ax)
#NOTE(review): stats.ttest_ind is two-sided by default, so the "one-sided"
#label in this title may be inaccurate — confirm.
ax.set_title("One-to-one Goalie Metrics \n Alpha = .05, one-sided independent T-Test")
ax.patch.set_alpha(0.5)
ax.set_ylabel('')
ax.set_xlabel('')
#Print a plain-language verdict for every pairwise test in the dict
for item in ttest_result_dict:
    print(' ')
    print(f'The p-value for this independent T-test is {ttest_result_dict[item][1]}')
    print("Therefore, the null hypothesis that FIFA's goalkeeper metrics %s are independent of one another"%(item))
    print(" was {result} at the .95 level of confidence.".format(result ='rejected' if ttest_result_dict[item][1]
    <=.05 else 'not rejected'))
    print(' ')
#Generate a heatmap of log-transformed p-values
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize= (12,8))
sns.heatmap(goalie_dflog, vmin=0, vmax=.38, annot = True, ax = ax)
#NOTE(review): stats.ttest_ind is two-sided by default, so the "one-sided"
#label in this title may be inaccurate — confirm.
ax.set_title("One-to-one Log Normalized Goalie Metrics \n Alpha = .05, one-sided independent T-Test")
ax.patch.set_alpha(0.5)
ax.set_ylabel('')
ax.set_xlabel('')
#Print a plain-language verdict for every pairwise test in the dict
for item in ttest_result_dict1:
    print(' ')
    print(f'The p-value for this independent T-test is {ttest_result_dict1[item][1]}')
    print("Therefore, the null hypothesis that FIFA's log-transformed goalkeeper metrics of")
    print("%s are independent of one another"%(item))
    print("was {result} at the .95 level of confidence.".format(result ='rejected' if ttest_result_dict1[item][1]
    <=.05 else 'not rejected'))
    print(' ')